In [82]:
# %sh
# wget https://raw.githubusercontent.com/fivethirtyeight/data/master/police-killings/police_killings.csv
In [83]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Load the police-killings CSV from the working directory (see the wget
# command in the first cell for where the file comes from).
# NOTE(review): no encoding is passed, so pandas uses its default; confirm the
# file decodes cleanly on the Python version this notebook targets.
police_killings = pd.read_csv(filepath_or_buffer="police_killings.csv")

# Preview the first five rows as this cell's output.
police_killings.head(n=5)
Out[83]:
In [84]:
# List every column name available in the dataset (printed as a NumPy array).
print(police_killings.columns.values)
In [85]:
# Count how many rows carry each distinct "raceethnicity" value.
print(police_killings["raceethnicity"].value_counts())
In [86]:
# Tally rows per "raceethnicity" value and draw the tallies as a bar chart.
raceethnicity_killings = police_killings["raceethnicity"].value_counts()
raceethnicity_killings.plot.bar()

# Render the figure, then release it so later plots start from a clean state.
plt.show()
plt.close()
In [87]:
# Drop rows whose p_income is the "-" placeholder.
# .copy() makes `income` an independent frame: without it, the column
# assignment below writes into a slice of police_killings, which triggers
# SettingWithCopyWarning and leaves it ambiguous which frame is modified.
income = police_killings[police_killings["p_income"] != "-"].copy()
print("police_killings: {0}".format(police_killings.shape))
print("income: {0}".format(income.shape))

# Convert the remaining p_income strings to integers (on the copy only).
income["p_income"] = income["p_income"].astype(int)

# Show both dtypes: the original column is untouched, the copy is now integer.
print(police_killings["p_income"].dtypes)
print(income["p_income"].dtypes)

# Plot a histogram of the cleaned income values.
income["p_income"].hist(bins=20)
plt.show()
plt.close()
In [88]:
# State population estimates; the file was obtained from:
# https://www.census.gov/popest/data/state/asrh/2015/files/SCPRC-EST2015-18+POP-RES.csv
# It is read from the working directory.
state_pop = pd.read_csv(filepath_or_buffer="SCPRC-EST2015-18+POP-RES.csv")

# Preview the first five rows as this cell's output.
state_pop.head(n=5)
Out[88]:
In [89]:
# Inspect the dtype pandas inferred for each column of the population table
# (shown as this cell's output).
state_pop.dtypes
Out[89]:
In [90]:
# Compare the two columns that will serve as the join key between the
# killings data ("state_fp") and the population table ("STATE").
print(
    police_killings["state_fp"].head(),
    state_pop["STATE"].head(),
    sep="\n",
)
In [91]:
# Number of rows in police_killings per distinct "state_fp" value; the
# resulting Series is indexed by the state_fp values themselves.
counts = police_killings["state_fp"].value_counts()
In [92]:
# Turn the per-state counts into a two-column frame: the state code under
# "STATE" (so it can be joined with state_pop later) and the count under
# "shootings". The frame keeps the index of `counts`.
states = counts.to_frame(name="shootings")
states.insert(0, "STATE", counts.index)
states.head()
Out[92]:
In [93]:
# Attach each state's name and 2015 population estimate to its shooting
# count, matching rows on the shared "STATE" column (inner join).
# Gotcha: this cell rebinds `states`, so running it twice merges again and
# yields suffixed duplicate columns; re-run the cell that builds `states` first.
states = pd.merge(
    states,
    state_pop.loc[:, ["STATE", "NAME", "POPESTIMATE2015"]],
    how="inner",
    on="STATE",
)
states.head()
Out[93]:
In [94]:
# Add "pop_millions": the 2015 population estimate divided by one million.
states = states.assign(pop_millions=states["POPESTIMATE2015"].div(10 ** 6))
states.head()
Out[94]:
In [95]:
# Add "rate" (shootings per million residents), then order the states from
# the highest rate to the lowest. Later cells rely on this ordering.
states = (
    states
    .assign(rate=states["shootings"] / states["pop_millions"])
    .sort_values(by="rate", ascending=False)
)
states.head()
Out[95]:
In [100]:
# Columns holding the share_* values, stored as strings in the raw data.
share = ["share_black", "share_white", "share_hispanic"]

# Keep a row only when none of the three share columns is the "-" placeholder.
share_filter = (police_killings[share] != "-").all(axis=1)

# .copy() makes `pk` an independent frame: without it, the dtype conversion
# below assigns into a slice of police_killings, which triggers
# SettingWithCopyWarning and may not behave as intended.
pk = police_killings[share_filter].copy()
print(police_killings.shape)
print(pk.shape)

# Convert the share columns from strings to floats (on the copy only).
pk[share] = pk[share].astype(float)
In [106]:
# State codes of the ten highest-rate and ten lowest-rate states. This relies
# on `states` already being sorted by "rate" in descending order.
highest_10 = states["STATE"].iloc[:10]
lowest_10 = states["STATE"].iloc[-10:]

# Rows of the killings data that fall in each group of states.
# NOTE(review): these are cut from police_killings rather than from `pk`, so
# the share_* columns keep their original (string) dtype here; confirm that
# is intended.
highest_10_df = police_killings.loc[police_killings["state_fp"].isin(highest_10)]
lowest_10_df = police_killings.loc[police_killings["state_fp"].isin(lowest_10)]

# Report the size of each group, then a preview of each.
print(highest_10_df.shape, lowest_10_df.shape, sep="\n")
print(highest_10_df.head(), lowest_10_df.head(), sep="\n")
In [146]:
# Average every numeric column within each group of states.
# numeric_only=True is passed explicitly because the frames contain string
# columns (e.g. p_income holds "-" placeholders): pandas >= 2.0 raises
# TypeError for those instead of silently skipping them as older versions did.
highest_mean = highest_10_df.mean(numeric_only=True).rename("highest")
lowest_mean = lowest_10_df.mean(numeric_only=True).rename("lowest")

# Place the two sets of means side by side, one column per group.
compared_mean = pd.concat([highest_mean, lowest_mean], axis=1)

# Note: this changes the float display format for the rest of the session,
# not just for the print below.
pd.options.display.float_format = '{:20,.2f}'.format
print(compared_mean)
In [ ]: